from Bio import SeqIO
import shutil
import os
import re
import sys
import subprocess
from subprocess import*
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

def CreateMapping(ref_fas,prof_fas, ctf_ct):
    """Project base-pairing calls from a ct file onto profile-alignment columns.

    Parameters:
        ref_fas:  FASTA file holding the sequence the structure was predicted
                  on; the record whose id contains "Reference" is used (the
                  last such record wins if there are several).
        prof_fas: FASTA file of the profile alignment; the record whose id is
                  exactly the reference id supplies the gapped sequence.
        ctf_ct:   tab-separated ct file. Lines containing "ENERGY" are treated
                  as headers; on every other line the fifth field is read as
                  the 1-based position of the pairing partner, 0 meaning
                  "not paired".

    Returns:
        A list with one entry per column of the aligned reference sequence:
        -1 where the reference has a gap ("-"), 1 where the underlying
        reference position is paired, 0 where it is unpaired.

    Raises:
        IndexError: if a ct line has fewer than five tab-separated fields, if
            a partner index lies beyond the reference length, or if the
            aligned reference has more non-gap characters than the reference.
        ValueError: if the fifth field of a ct line is not an integer.
    """
    id_before=""
    seq_before=""
    with open(ref_fas,'r') as h:
        # Sequence used by NASP for the secondary-structure prediction.
        for rec in SeqIO.parse(h,'fasta'):
            if "Reference" in str(rec.id):
                id_before=str(rec.id)
                seq_before=str(rec.seq)

    seq_after=""
    with open(prof_fas,"r") as h1:
        for rec in SeqIO.parse(h1,"fasta"):
            if str(rec.id)==id_before:
                seq_after=str(rec.seq)

    # One flag per reference position: 1 = paired, 0 = unpaired.
    rec_list=[0]*len(seq_before)
    with open(ctf_ct,"r") as h_ct:
        for line in h_ct:
            if "ENERGY" in line:
                continue
            if not line.strip():
                # Blank (typically trailing) lines carry no pairing data.
                continue
            partner=int(line.split("\t")[4])
            if partner!=0:
                rec_list[partner-1]=1

    # Walk the aligned reference: gap columns get -1, every other column
    # consumes the next flag from the ungapped reference.
    newMapp=[]
    index=0
    for base in seq_after:
        if base=="-":
            newMapp.append(-1)
        else:
            newMapp.append(rec_list[index])
            index+=1
    return newMapp
	
def RenameSeq():
    """Write a copy of every ".fas" file in the current directory with generic headers.

    For each file NAME ending in ".fas" (as listed before any output is
    created) a file "E_NAME" is written in which each header line (a line
    starting with ">") is replaced by ">Seq00<k>", where k counts headers
    from 1 within that file. All other lines are copied unchanged.
    The input and output names are printed once per file.

    Returns:
        None.
    """
    # Snapshot the listing first so freshly written "E_" files are not
    # picked up during this run.
    ls_f_fas=[name for name in os.listdir('.') if name.endswith(".fas")]
    for f in ls_f_fas:
        out_name="E_"+f
        with open(f,"r") as h_in:
            in_lines=h_in.readlines()
        with open(out_name,"w") as h_out:
            seq=1
            for line in in_lines:
                if line.startswith(">"):
                    h_out.write(">Seq00"+str(seq)+"\n")
                    seq+=1
                else:
                    h_out.write(line)
        # Single pre-formatted string: prints the same text as the former
        # `print f, " ", out_name` and is valid with or without the
        # print statement.
        print("%s   %s" % (f, out_name))
		
def ConvFastaToPhy(file_names_list):#input a list of file names
    """Convert each FASTA file in a list to PHYLIP format.

    The output name is the input name with its final extension replaced by
    ".phy". Only the last extension is stripped, so names containing extra
    dots or a leading "./" keep the rest of their name instead of being cut
    at the first dot.

    Parameters:
        file_names_list: list of FASTA file names to convert.

    Returns:
        The list of PHYLIP file names written, in input order.
    """
    phylip_files_saved=[]
    for fasta_name in file_names_list:
        print(fasta_name)
        name_of_file=os.path.splitext(fasta_name)[0]+'.phy'
        phylip_files_saved.append(name_of_file)
        print("%s %s" % (fasta_name, name_of_file))
        SeqIO.convert(fasta_name,'fasta', name_of_file, "phylip")
    return phylip_files_saved
	
def ConvAllFas2Phy():
    """Convert every "paired*.fas" file in the current directory to PHYLIP.

    Returns:
        Whatever ConvFastaToPhy returns for the selected file names.
    """
    selected=[
        entry for entry in os.listdir(".")
        if entry.startswith("paired") and entry.endswith(".fas")
    ]
    return ConvFastaToPhy(selected)
	
def get_paired_unpaired(fas_file,mappin):
    """Split an alignment into its paired and unpaired columns.

    Parameters:
        fas_file: name of the FASTA alignment to split; it is also used to
                  build the output names "paired_" + fas_file and
                  "unpaired_" + fas_file.
        mappin:   per-column flags: 1 = paired column, 0 = unpaired column,
                  any other value (e.g. -1) = column dropped from both outputs.

    Returns:
        None. Two FASTA files are written, each containing every input
        record restricted to the selected columns.

    Raises:
        IndexError: if a record is shorter than a column index selected by
            mappin.
    """
    with open(fas_file) as h_fas:
        fas_rec=list(SeqIO.parse(h_fas, "fasta"))

    # Column indices for each output, computed once for all records.
    paired_cols=[col for col, flag in enumerate(mappin) if flag==1]
    unpaired_cols=[col for col, flag in enumerate(mappin) if flag==0]

    # Keep the full sequences as plain strings: the records are re-used (and
    # their .seq overwritten) for both output files.
    full_seqs=[str(rec.seq) for rec in fas_rec]

    for prefix, cols in (("paired_", paired_cols), ("unpaired_", unpaired_cols)):
        for rec, full in zip(fas_rec, full_seqs):
            # join() avoids rebuilding the string once per column.
            rec.seq=Seq("".join([full[col] for col in cols]), IUPAC.unambiguous_dna)
        with open(prefix+fas_file,'w') as output:
            SeqIO.write(fas_rec,output,"fasta")

def _read_fasta_gaps_as_n(fasta_path):
    """Return the records of a FASTA file with every "-" replaced by "N"."""
    records=[]
    with open(fasta_path,"r") as handle:
        for rec in SeqIO.parse(handle,"fasta"):
            seq_mut=rec.seq.tomutable()
            for pos in range(len(seq_mut)):
                if seq_mut[pos]=="-":
                    seq_mut[pos]="N"
            rec.seq=seq_mut
            records.append(rec)
    return records


def Profile_Algn(import_in1fas,import_in2fas):
    """Profile-align two alignments with MUSCLE and split the result by pairing.

    Steps:
      1. Read both FASTA files, replacing gaps with "N".
      2. Write them to temporary files "ref_" + import_in1fas (first record
         renamed to "Reference") and "full_" + import_in2fas.
      3. Run MUSCLE in -profile mode, producing "prof_" + import_in2fas.
      4. Build the pairing map from the ct file named after import_in1fas
         (text before its first "." plus "_0.05.ct") and write the paired and
         unpaired alignments via get_paired_unpaired.

    The temporary "ref_" and "full_" files are removed whether or not the
    steps above succeed. If MUSCLE exits with a non-zero status, an error
    (including MUSCLE's stderr output) is written to stderr and the function
    returns without touching the mapping step, so a stale or missing "prof_"
    file is never used.

    Parameters:
        import_in1fas: FASTA file of the sequences the structure was
                       predicted on; its first record becomes the reference.
        import_in2fas: FASTA file of the full alignment.

    Returns:
        None.
    """
    print(import_in1fas)
    print(import_in2fas)

    Ref_Fas_Name="ref_"+import_in1fas
    Full_Fas_Name="full_"+import_in2fas
    Prof_Fas_Name="prof_"+import_in2fas

    print("performing the profile alignment")
    h1list=_read_fasta_gaps_as_n(import_in1fas)
    h2list=_read_fasta_gaps_as_n(import_in2fas)

    try:
        # CreateMapping later looks for a record whose id contains "Reference".
        h1list[0].id="Reference"
        with open(Ref_Fas_Name,"w") as hrefout1:
            SeqIO.write(h1list,hrefout1,"fasta")
        with open(Full_Fas_Name,"w") as hrefout2:
            SeqIO.write(h2list,hrefout2,"fasta")

        #cmd=['muscle3.8.31_i86win32.exe','-profile','-in1', Full_Fas_Name, '-in2',Ref_Fas_Name,'-out', Prof_Fas_Name] #for windows
        #cmd=['./muscle3.8.31_i86darwin32','-profile','-in1', Full_Fas_Name, '-in2',Ref_Fas_Name,'-out', Prof_Fas_Name] #for MAC
        cmd=['./muscle3.8.31_i86linux32','-profile','-in1', Full_Fas_Name, '-in2',Ref_Fas_Name,'-out', Prof_Fas_Name] #for Linux

        print(cmd)
        proc=Popen(cmd,stdout=PIPE,stderr=PIPE)
        # communicate() waits for the process, so returncode is set afterwards.
        (output, error)=proc.communicate()
        if proc.returncode != 0:
            sys.stderr.write('Error occured running Muscle\n')
            if error:
                if not isinstance(error, str):
                    error=error.decode("utf-8","replace")
                sys.stderr.write(error)
            # Without a fresh profile alignment the mapping would be wrong.
            return
        print("profile alignment completed")

        PATH_IN_CT=import_in1fas.split(".")[0]+"_0.05.ct"
        print("ccc " + import_in1fas)
        print("CT "+ PATH_IN_CT)
        #create mapp
        mapp=CreateMapping(Ref_Fas_Name,Prof_Fas_Name,PATH_IN_CT)
        #split data set into paired and unpaired
        get_paired_unpaired(Prof_Fas_Name,mapp)
    finally:
        # Temporary inputs for MUSCLE; remove them on success and on failure.
        for tmp_name in (Ref_Fas_Name, Full_Fas_Name):
            if os.path.exists(tmp_name):
                os.remove(tmp_name)
    print("Done! Check the paired and unpaired data sets.")
	

#================================================== Main Program ==================================================
# Input alignments must end with "_full.fas"; the matching ct files from NASP
# must end with "_0.05.ct". Files produced by earlier runs (names starting
# with "prof", "unpaired", "paired" or "ref") are skipped.
Full_fas=[]
for i in os.listdir("."):
    if not i.endswith('_full.fas'):
        continue
    if i.startswith(("prof","unpaired","paired","ref")):
        continue
    Full_fas.append(i)

# Example of a single manual run:
#Profile_Algn("DNA-C_14seqs.fas","DNA-C_full.fas")

for i in Full_fas:
    # The companion NASP alignment shares the text before the first "_".
    t=i.split("_")[0]+"_14seqs.fas"
    print("%s %s" % (i, t))
    # Imports both FASTA files, replaces gaps with "N" and runs the profile alignment.
    Profile_Algn(t,i)

